This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(caret)
library(data.table)
library(plotly)
# attach the built-in iris dataset
data(iris)

dt <- iris
# Seed the RNG so the 80/20 partition is reproducible. The model chunks
# below all call set.seed(7), but the split itself was previously unseeded,
# so every re-run produced a different train/validation partition.
set.seed(7)
# obtain 80% of the data set for training (stratified by Species)
validation_index <- createDataPartition(dt$Species, p = 0.8, list = FALSE)
# select 20% for validation and 80% for training
# NOTE(review): despite its name, dtTest holds the 80% TRAINING partition;
# dtValidation is the 20% hold-out set used at the very end.
dtValidation <- dt[-validation_index, ] %>% data.table()
dtTest <- dt[validation_index, ] %>% data.table()

Summarize data:

# dimensions of the training partition (rows x columns); expect 120 x 5
dim(dtTest)
[1] 120   5
# list the class of each attribute column.
# vapply replaces sapply: its return type is guaranteed to be a character
# vector, whereas sapply's output type silently depends on its input.
vapply(dt, function(col) class(col)[1], character(1))
Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
   "numeric"    "numeric"    "numeric"    "numeric"     "factor" 
# peek at the first six rows of the dataset
head(dt)
# list the levels (distinct target classes) of the Species factor
levels(dt$Species)
[1] "setosa"     "versicolor" "virginica" 
# class distribution of the training partition: raw counts alongside the
# percentage each species contributes (tabulate once, reuse for both columns)
speciesTable <- table(dtTest$Species)
percentage <- 100 * prop.table(speciesTable)
cbind(freq = speciesTable, percentage = percentage)
           freq percentage
setosa       40   33.33333
versicolor   40   33.33333
virginica    40   33.33333
# statistical summary of the training partition: min/quartiles/mean/max for
# the numeric columns, level counts for Species
summary(dtTest)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species  
 Min.   :4.300   Min.   :2.200   Min.   :1.000   Min.   :0.100   setosa    :40  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:40  
 Median :5.750   Median :3.000   Median :4.400   Median :1.350   virginica :40  
 Mean   :5.826   Mean   :3.057   Mean   :3.758   Mean   :1.211                  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
 Max.   :7.900   Max.   :4.400   Max.   :6.700   Max.   :2.500                  

Visualize Dataset

# univariate plots: boxplot of each individual variable.
# Reshape the training data to long format: one row per (Species, measurement)
# pair, with the measurement name in "Type" and its value in "Value".
# Fixes vs. original: dtTest is already a data.table so the redundant
# data.table() conversion is dropped, the stray empty positional argument in
# the melt call is removed, and the melt() generic is called instead of the
# melt.data.table method directly.
dtPlot <- melt(dtTest, id.vars = "Species",
               measure.vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
               variable.name = "Type", value.name = "Value")
# min-max normalize pooled over ALL measurements so the four variables share
# a common [0, 1] scale in the boxplot
dtPlot$Normalized <- (dtPlot$Value - min(dtPlot$Value))/(max(dtPlot$Value) - min(dtPlot$Value))
plot_ly(dtPlot, y = ~Normalized, color = ~Type, type = "box", boxpoints = "all", jitter = 0.3)
# bar plot of the class breakdown: one bar per species with its row count
classCounts <- dtTest[, .N, keyby = Species]
plot_ly(classCounts, x = ~Species, y = ~N, type = "bar")
# multivariate plots: feature plot of attributes and color by class, box plot for each attribute by species

Evaluate Some Algorithms

  1. Set-up the test harness to use 10-fold cross validation
  2. Build 5 different models to predict species from flower measurements
  3. Select the best model

Test Harness

# test harness: 10-fold cross-validation resampling shared by every model below
control <- trainControl(method = "cv", number = 10)
Warning messages:
1: package ‘ggplot2’ was built under R version 3.6.3 
2: package ‘caret’ was built under R version 3.6.3 
3: package ‘data.table’ was built under R version 3.6.3 
4: package ‘plotly’ was built under R version 3.6.3 
# model-selection metric: fraction of correctly classified instances
metric <- "Accuracy"

Build models

# Train four candidate classifiers on the 80% partition (dtTest).
# set.seed(7) is re-issued before each train() call so that every model is
# evaluated on identical cross-validation folds.

# Linear Discriminant Analysis (LDA) -- simple linear classifier
set.seed(7)
fit.lda <- train(Species ~ ., data = dtTest, method = "lda", metric = metric, trControl = control)

# Classification and Regression Trees (CART) -- nonlinear
set.seed(7)
fit.cart <- train(Species ~ ., data = dtTest, method = "rpart", metric = metric, trControl = control)

# k-Nearest Neighbors (kNN) -- nonlinear
set.seed(7)
fit.knn <- train(Species ~ ., data = dtTest, method = "knn", metric = metric, trControl = control)

# Support Vector Machine (SVM) -- NOTE(review): method "svmRadial" fits a
# radial (RBF) kernel, not the linear kernel the original comment claimed;
# use method = "svmLinear" if a linear kernel is actually intended.
set.seed(7)
fit.svm <- train(Species ~ ., data = dtTest, method = "svmRadial", metric = metric, trControl = control)
1 package is needed for this model and is not installed. (kernlab). Would you like to try to install it now?
1: yes
2: no
yes
Installing package into ‘/home/jabagat/R/x86_64-pc-linux-gnu-library/3.6’
(as ‘lib’ is unspecified)
trying URL 'https://rstudiopm.santeecooper.com/prod-cran/__linux__/centos7/latest/src/contrib/kernlab_0.9-29.tar.gz'
Content type 'application/x-gzip' length 2457679 bytes (2.3 MB)
==================================================
downloaded 2.3 MB

* installing *binary* package ‘kernlab’ ...
* DONE (kernlab)

The downloaded source packages are in
    ‘/tmp/Rtmp2VujRU/downloaded_packages’
# Random Forest (RF) -- nonlinear tree ensemble; same seed so it shares
# the same CV folds as the other models
set.seed(7)
fit.rf <- train(Species ~ ., data = dtTest, method = "rf", metric = metric, trControl = control)
1 package is needed for this model and is not installed. (randomForest). Would you like to try to install it now?
1: yes
2: no
yes
Installing package into ‘/home/jabagat/R/x86_64-pc-linux-gnu-library/3.6’
(as ‘lib’ is unspecified)
trying URL 'https://rstudiopm.santeecooper.com/prod-cran/__linux__/centos7/latest/src/contrib/randomForest_4.6-14.tar.gz'
Content type 'application/x-gzip' length 263356 bytes (257 KB)
==================================================
downloaded 257 KB

* installing *binary* package ‘randomForest’ ...
* DONE (randomForest)

The downloaded source packages are in
    ‘/tmp/Rtmp2VujRU/downloaded_packages’

Select the best model

# summarize the cross-validation accuracy of the five fitted models
modelList <- list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf)
results <- resamples(modelList)
summary(results)

Call:
summary.resamples(object = results)

Models: lda, cart, knn, svm, rf 
Number of resamples: 10 

Accuracy 
          Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
lda  0.9166667 1.0000000 1.0000000 0.9916667       1    1    0
cart 0.8333333 0.9375000 1.0000000 0.9583333       1    1    0
knn  0.9166667 0.9375000 1.0000000 0.9750000       1    1    0
svm  0.9166667 0.9166667 0.9583333 0.9583333       1    1    0
rf   0.8333333 0.9375000 1.0000000 0.9666667       1    1    0

Kappa 
      Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
lda  0.875 1.00000 1.0000 0.9875       1    1    0
cart 0.750 0.90625 1.0000 0.9375       1    1    0
knn  0.875 0.90625 1.0000 0.9625       1    1    0
svm  0.875 0.87500 0.9375 0.9375       1    1    0
rf   0.750 0.90625 1.0000 0.9500       1    1    0
# compare the resampled accuracy/kappa distributions visually (lattice dotplot)
dotplot(results)

# summarize the best model (LDA had the highest mean accuracy and kappa)
print(fit.lda)
Linear Discriminant Analysis 

120 samples
  4 predictor
  3 classes: 'setosa', 'versicolor', 'virginica' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
Resampling results:

  Accuracy   Kappa 
  0.9916667  0.9875

Make Predictions

# estimate the out-of-sample skill of LDA on the 20% hold-out partition
predictions <- predict(fit.lda, newdata = dtValidation)
confusionMatrix(predictions, dtValidation$Species)
Confusion Matrix and Statistics

            Reference
Prediction   setosa versicolor virginica
  setosa         10          0         0
  versicolor      0          9         1
  virginica       0          1         9

Overall Statistics
                                          
               Accuracy : 0.9333          
                 95% CI : (0.7793, 0.9918)
    No Information Rate : 0.3333          
    P-Value [Acc > NIR] : 8.747e-12       
                                          
                  Kappa : 0.9             
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: setosa Class: versicolor Class: virginica
Sensitivity                 1.0000            0.9000           0.9000
Specificity                 1.0000            0.9500           0.9500
Pos Pred Value              1.0000            0.9000           0.9000
Neg Pred Value              1.0000            0.9500           0.9500
Prevalence                  0.3333            0.3333           0.3333
Detection Rate              0.3333            0.3000           0.3000
Detection Prevalence        0.3333            0.3333           0.3333
Balanced Accuracy           1.0000            0.9250           0.9250
LS0tCnRpdGxlOiAiUiBNTCBFeGVyY2lzZSAtIGlyaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KClRoaXMgaXMgYW4gW1IgTWFya2Rvd25dKGh0dHA6Ly9ybWFya2Rvd24ucnN0dWRpby5jb20pIE5vdGVib29rLiBXaGVuIHlvdSBleGVjdXRlIGNvZGUgd2l0aGluIHRoZSBub3RlYm9vaywgdGhlIHJlc3VsdHMgYXBwZWFyIGJlbmVhdGggdGhlIGNvZGUuIAoKVHJ5IGV4ZWN1dGluZyB0aGlzIGNodW5rIGJ5IGNsaWNraW5nIHRoZSAqUnVuKiBidXR0b24gd2l0aGluIHRoZSBjaHVuayBvciBieSBwbGFjaW5nIHlvdXIgY3Vyc29yIGluc2lkZSBpdCBhbmQgcHJlc3NpbmcgKkN0cmwrU2hpZnQrRW50ZXIqLiAKCmBgYHtyLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KGNhcmV0KQpsaWJyYXJ5KGRhdGEudGFibGUpCmxpYnJhcnkocGxvdGx5KQpgYGAKCgpgYGB7ciwgY2FjaGU9VFJVRX0KIyBhdHRhY2ggaXJpcwpkYXRhKGlyaXMpCgpkdCA8LSBpcmlzCmBgYAoKYGBge3J9CiMgb2J0YWluIDgwJSBvZiB0aGUgZGF0YSBzZXQgZm9yIHRyYWluaW5nCnZhbGlkYXRpb25faW5kZXggPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihkdCRTcGVjaWVzLCBwID0gMC44LCBsaXN0ID0gRkFMU0UpCiMgc2VsZWN0IDIwJSBmb3IgdmFsaWRhdGlvbiBhbmQgODAlIGZvciB0cmFpbmluZwpkdFZhbGlkYXRpb24gPC0gZHRbLXZhbGlkYXRpb25faW5kZXgsIF0gJT4lIGRhdGEudGFibGUoKQpkdFRlc3QgPC0gZHRbdmFsaWRhdGlvbl9pbmRleCwgXSAlPiUgZGF0YS50YWJsZSgpCmBgYAoKIyBTdW1tYXJpemUgZGF0YTogCi0gZGltZW5zaW9ucwotIHR5cGVzIG9mIGF0dHJpYnV0ZXMKLSBwZWFrIGF0IHRoZSBkYXRhCi0gbGV2ZWxzIG9mIGNsYXNzZXMsIAotIGJyZWFrZG93biBvZiB0aGUgaW5zdGFuY2VzIGluIGVhY2ggY2xhc3MKLSBzdGF0aXN0aWNhbCBzdW1tYXJ5IG9mIGFsbCBhdHRyaWJ1dGVzCgpgYGB7cn0KIyBkaW1lbnNpb25zCmRpbShkdFRlc3QpCmBgYAoKCmBgYHtyfQojIGxpc3QgYXR0cmlidXRlcwpzYXBwbHkoZHQsIGNsYXNzKQpgYGAKCgpgYGB7cn0KIyBwZWVrIGF0IGRhdGFzZXQKaGVhZChkdCkKYGBgCgoKYGBge3J9CiMgbGlzdCB0aGUgbGV2ZWxzIGZvciB0aGUgY2xhc3MKbGV2ZWxzKGR0JFNwZWNpZXMpCmBgYAoKYGBge3J9CiMgY2xhc3MgZGlzdHJpYnV0aW9uIG9mIHRlc3QgZGF0YQpwZXJjZW50YWdlIDwtIHByb3AudGFibGUodGFibGUoZHRUZXN0JFNwZWNpZXMpKSoxMDAKY2JpbmQoZnJlcSA9IHRhYmxlKGR0VGVzdCRTcGVjaWVzKSwgcGVyY2VudGFnZSA9IHBlcmNlbnRhZ2UpCmBgYAoKYGBge3J9CiMgc3RhdGlzdGljYWwgc3VtbWFyeQpzdW1tYXJ5KGR0VGVzdCkKYGBgCgojIFZpc3VhbGl6ZSBEYXRhc2V0CmBgYHtyfQojIHVuaXZhcmlhdGUgcGxvdHM6IGJveHBsb3Qgb2YgZWFjaCBpbmRpdmlkdWFsIHZhcmlhYmxlCmR0UGxvdCA8LSBtZWx0LmRhdGEudGFibGUoZHRUZXN0LCBpZC52YXJzID0gIlNwZWNpZXMiLHZhcmlhYmxlLm5h
bWUgPSAiVHlwZSIsIHZhbHVlLm5hbWUgPSAiVmFsdWUiLCAKICAgICAgICAgICAgICAgICAgICAgICAgICBtZWFzdXJlLnZhcnMgPSBjKCJTZXBhbC5MZW5ndGgiLCAiU2VwYWwuV2lkdGgiLCAiUGV0YWwuTGVuZ3RoIiwgIlBldGFsLldpZHRoIikpCmR0UGxvdCROb3JtYWxpemVkIDwtIChkdFBsb3QkVmFsdWUgLSBtaW4oZHRQbG90JFZhbHVlKSkvKG1heChkdFBsb3QkVmFsdWUpIC0gbWluKGR0UGxvdCRWYWx1ZSkpCnBsb3RfbHkoZHRQbG90LCB5ID0gfk5vcm1hbGl6ZWQsIGNvbG9yID0gflR5cGUsIHR5cGUgPSAiYm94IiwgYm94cG9pbnRzID0gImFsbCIsIGppdHRlciA9IDAuMykKYGBgCgpgYGB7cn0KIyBiYXIgcGxvdCBvZiBjbGFzcyBicmVha2Rvd24KcGxvdF9seShkdFRlc3RbLCAuTiwga2V5YnkgPSBTcGVjaWVzXSwgeSA9IH5OLCB4ID0gflNwZWNpZXMsIHR5cGUgPSAiYmFyIikKYGBgCgpgYGB7cn0KIyBtdWx0aXZhcmlhdGUgcGxvdHM6IGZlYXR1cmUgcGxvdCBvZiBhdHRyaWJ1dGVzIGFuZCBjb2xvciBieSBjbGFzcywgYm94IHBsb3QgZm9yIGVhY2ggYXR0cmlidXRlIGJ5IHNwZWNpZXMKYGBgCgojIEV2YWx1YXRlIFNvbWUgQWxnb3JpdGhtcwoxLiBTZXQtdXAgdGhlIHRlc3QgaGFybmVzcyB0byB1c2UgMTAtZm9sZCBjcm9zcyB2YWxpZGF0aW9uCiAgLSBUaGlzIHdpbGwgc3BsaXQgb3VyIGRhdGFzZXQgaW50byAxMCBwYXJ0cywgdHJhaW4gaW4gOSBhbmQgdGVzdCBvbiAxIGFuZCByZWxlYXNlIGZvciBhbGwgY29tYmluYXRpb25zIG9mIHRyYWluLXRlc3Qgc3BsaXRzLiBXZSB3aWxsIGFsc28gcmVwZWF0IHRoZSBwcm9jZXNzIDMgdGltZXMgZm9yIGVhY2ggYWxnb3JpdGhtIHdpdGggZGlmZmVyZW50IHNwbGl0cyBvZiB0aGUgZGF0YSBpbnRvIDEwIGdyb3VwcywgaW4gYW4gZWZmb3J0IHRvIGdldCBhIG1vcmUgYWNjdXJhdGUgZXN0aW1hdGUuCiAgLSB0aGUgbWV0cmljIG9mIOKAnEFjY3VyYWN54oCdIHRvIGV2YWx1YXRlIG1vZGVscy4gCiAgICAtIFRoaXMgaXMgYSByYXRpbyBvZiB0aGUgbnVtYmVyIG9mIGNvcnJlY3RseSBwcmVkaWN0ZWQgaW5zdGFuY2VzIGluIGRpdmlkZWQgYnkgdGhlIHRvdGFsIG51bWJlciBvZiBpbnN0YW5jZXMgaW4gdGhlIGRhdGFzZXQgbXVsdGlwbGllZCBieSAxMDAgdG8gZ2l2ZSBhIHBlcmNlbnRhZ2UgKGUuZy4gOTUlIGFjY3VyYXRlKS4gCiAgICAtIFdlIHdpbGwgYmUgdXNpbmcgdGhlIG1ldHJpYyB2YXJpYWJsZSB3aGVuIHdlIHJ1biBidWlsZCBhbmQgZXZhbHVhdGUgZWFjaCBtb2RlbCBuZXh0LgoyLiBCdWlsZCA1IGRpZmZlcmVudCBtb2RlbHMgdG8gcHJlZGljdCBzcGVjaWVzIGZyb20gZmxvd2VyIG1lYXN1cmVtZW50cwogIC0gTGluZWFyIERpc2NyaW1pbmFudCBBbmFseXNpcyAoTERBKTsgc2ltcGxlIGxpbmVhcgogIC0gQ2xhc3NpZmljYXRpb24gYW5kIFJlZ3Jlc3Npb24gVHJlZXMgKENBUlQpOyBub25saW5lYXIKICAtIGstTmVhcmVzdCBOZWlnaGJvcnMgKGtOTik7IG5vbmxpbmVh
cgogIC0gU3VwcG9ydCBWZWN0b3IgTWFjaGluZXMgKFNWTSkgd2l0aCBhIGxpbmVhciBrZXJuZWw7IGNvbXBsZXggbGluZWFyCiAgLSBSYW5kb20gRm9yZXN0IChSRik7IGNvbXBsZXggbGluZWFyCjMuIFNlbGVjdCB0aGUgYmVzdCBtb2RlbAoKIyMgVGVzdCBIYXJuZXNzCmBgYHtyfQpjb250cm9sIDwtIHRyYWluQ29udHJvbChtZXRob2QgPSAiY3YiLCBudW1iZXIgPSAxMCkKbWV0cmljIDwtICJBY2N1cmFjeSIKYGBgCgojIyBCdWlsZCBtb2RlbHMKYGBge3J9CiMgTGluZWFyIERpc2NyaW1pbmFudCBBbmFseXNpcyAoTERBKQpzZXQuc2VlZCg3KQpmaXQubGRhIDwtIHRyYWluKFNwZWNpZXMgfiAuLCBkYXRhID0gZHRUZXN0LCBtZXRob2QgPSAibGRhIiwgbWV0cmljID0gbWV0cmljLCB0ckNvbnRyb2wgPSBjb250cm9sKQoKIyBDbGFzc2lmaWNhdGlvbiBhbmQgUmVncmVzc2lvbiBUcmVlcyAoQ0FSVCkKc2V0LnNlZWQoNykKZml0LmNhcnQgPC0gdHJhaW4oU3BlY2llcyB+IC4sIGRhdGEgPSBkdFRlc3QsIG1ldGhvZCA9ICJycGFydCIsIG1ldHJpYyA9IG1ldHJpYywgdHJDb250cm9sID0gY29udHJvbCkKCiMgay1OZWFyZXN0IE5laWdoYm9ycyAoa05OKQpzZXQuc2VlZCg3KQpmaXQua25uIDwtIHRyYWluKFNwZWNpZXMgfiAuLCBkYXRhID0gZHRUZXN0LCBtZXRob2QgPSAia25uIiwgbWV0cmljID0gbWV0cmljLCB0ckNvbnRyb2wgPSBjb250cm9sKQoKIyBTdXBwb3J0IFZlY3RvciBNYWNoaW5lcyAoU1ZNKSB3aXRoIGEgbGluZWFyIGtlcm5lbApzZXQuc2VlZCg3KQpmaXQuc3ZtIDwtIHRyYWluKFNwZWNpZXMgfiAuLCBkYXRhID0gZHRUZXN0LCBtZXRob2QgPSAic3ZtUmFkaWFsIiwgbWV0cmljID0gbWV0cmljLCB0ckNvbnRyb2wgPSBjb250cm9sKQoKIyBSYW5kb20gRm9yZXN0IChSRikKc2V0LnNlZWQoNykKZml0LnJmIDwtIHRyYWluKFNwZWNpZXMgfiAuLCBkYXRhID0gZHRUZXN0LCBtZXRob2QgPSAicmYiLCBtZXRyaWMgPSBtZXRyaWMsIHRyQ29udHJvbCA9IGNvbnRyb2wpCmBgYAoKIyMgU2VsZWN0IHRoZSBiZXN0IG1vZGVsCgpgYGB7cn0KIyBzdW1hcml6ZSBhY2N1cmFjeSBvZiB0aGUgbW9kZWxzCnJlc3VsdHMgPC0gcmVzYW1wbGVzKGxpc3QobGRhID0gZml0LmxkYSwgY2FydCA9IGZpdC5jYXJ0LCBrbm4gPSBmaXQua25uLCBzdm0gPSBmaXQuc3ZtLCByZiA9IGZpdC5yZikpCnN1bW1hcnkocmVzdWx0cykKYGBgCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBhY2N1cmFjeSBvZiB0aGUgbW9kZWxzCmRvdHBsb3QocmVzdWx0cykKYGBgCgpgYGB7cn0KIyBzdW1tYXJpemUgdGhlIGJlc3QgbW9kZWwKcHJpbnQoZml0LmxkYSkKYGBgCgojIE1ha2UgUHJlZGljdGlvbnMKLSBzaW5jZSBMREEgd2FzIHRoZSBtb3N0IGFjY3VyYXRlLCBydW4gTERBIG9uIHRoZSB2YWxpZGF0aW9uIApgYGB7cn0KIyBlc3RpbWF0ZSBza2lsbCBvZiBMREEKcHJlZGljdGlvbnMgPC0gcHJlZGljdChmaXQubGRhLCBkdFZhbGlkYXRpb24pCmNvbmZ1c2lvbk1hdHJp
eChwcmVkaWN0aW9ucywgZHRWYWxpZGF0aW9uJFNwZWNpZXMpCmBgYAoKCg==